In [275]:
import os
import pickle
import multiprocessing
from importlib import reload  # `imp` is deprecated; importlib provides reload

import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from joblib import Parallel, delayed

import constants, utils, inference, evaluation

# silence SettingWithCopyWarning from the column assignments below
pd.options.mode.chained_assignment = None

Load Evaluation Data


In [188]:
data = pd.read_hdf('/data/Instacart/eval.h5')
orders = data[['order_id']].drop_duplicates()
gid = data[constants.ID_COLS]   # identifier columns (user_id, product_id, order_id, ...)
label = data['label']

In [3]:
def generate_pred(m, idx, is_sub=False):
    '''
    m: model path
    idx: index of the model (used for logging only)
    is_sub: if True, return raw scores only (submission mode);
            otherwise also compute evaluation metrics
    '''
    print('Evaluating Model {} ...'.format(idx))
    print('Model Path {}'.format(m))
    bst = pickle.load(open(m, 'rb'))
    feat = data[utils.get_feat_col(bst)]   # `data` is a module-level global
    pred = utils.get_predition(bst, feat)

    user_product = gid[['user_id', 'product_id', 'order_id']]  # `gid` is a module-level global
    user_product['score'] = pred

    if not is_sub:
        user_product['label'] = label
        auc = roc_auc_score(label, pred)   # `label` is a module-level global
        print('Evaluation AUC {}'.format(auc))
        op = user_product.copy()
        op = utils.tarbox_f1_optim(op, low_bound=0)
        # `products` comes back as a space-separated string that may contain
        # the literal token 'None'; parse it into a list of product ids
        op['products'] = op['products'].apply(lambda x: [int(i) if i != 'None' else i for i in x.split()])
        # left-join so every order keeps a row, even if no products were selected
        op = pd.merge(pd.DataFrame({'order_id': user_product.order_id.unique()}),
                      op, on=['order_id'], how='left')
        gold = evaluation.get_gold(user_product)
        res = evaluation.evaluation(gold, op[['order_id', 'products']])
        mf1 = res.f1score.mean()
        print('F1 Optimization Result: mean-f1-score {}'.format(mf1))
        eval_res = {'model_file': m.split('/')[-1], 'eval_auc': auc, 'eval_mf1': mf1}
        return eval_res, pred
    else:
        return pred
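
`utils.tarbox_f1_optim` above picks, per order, how many of the highest-scoring products to keep. Its internals are not shown in this notebook; the cell below is a minimal sketch of one common plug-in approximation of expected F1 under an independence assumption, not necessarily what `utils.tarbox_f1_optim` implements:

In [ ]:
def approx_best_topk(probs):
    '''Sketch: choose the top-k cutoff maximizing a plug-in estimate of
    expected F1. With E[TP] = sum of the top-k probabilities,
    E[FP] = k - E[TP] and E[FN] = total - E[TP], expected F1 is roughly
    2*E[TP] / (k + total).'''
    p = np.sort(np.asarray(probs))[::-1]   # scores, descending
    total = p.sum()
    best_k, best_f1, tp = 0, 0.0, 0.0
    for k in range(1, len(p) + 1):
        tp += p[k - 1]
        f1 = 2 * tp / (k + total)
        if f1 > best_f1:
            best_k, best_f1 = k, f1
    return best_k, best_f1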

In [327]:
%%time
eval_preds_8 = []
eval_infos_8 = []
for i, m in enumerate(constants.MODEL_PATH_8):
    infos, preds = generate_pred(m, i, is_sub=False)
    # preds = generate_pred(m, i, True)
    eval_preds_8.append(preds)
    eval_infos_8.append(infos)
    # eval_infos_5.append({'model_file':m.split('/')[-1], 'eval_auc': 0.840101, 'eval_mf1': 0.404459})


Evaluating Model 0 ...
Model Path /home/public/Instacart/lgb/lgb_train_vip_gbdt_0.840647
Evaluation AUC 0.8394667149905357
F1 Optimization Result: mean-f1-score 0.4035948872923294
CPU times: user 8min 4s, sys: 9.51 s, total: 8min 14s
Wall time: 2min 54s

Bagging Median
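
The cells below first score a single strong LightGBM DART model as a baseline, then blend the per-model score vectors with an element-wise median, which is less sensitive to one badly calibrated model than a mean. A minimal sketch of the blend (hypothetical helper, assuming every score vector is aligned to the same `gid` rows):

In [ ]:
# Hypothetical helper, not used below -- the later cells inline the same
# np.median calls. Each entry of `preds` must be a score vector aligned
# to the same (user, product, order) rows.
def median_bag(preds):
    return np.median(np.vstack(preds), axis=0)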


In [354]:
big_lgb_dart = pd.read_csv('./submission/eval_big_lgb_dart_0.8386003614599506.csv')

In [355]:
user_product = gid[['user_id', 'product_id', 'order_id']]
user_product['label'] = label
user_product['score'] = big_lgb_dart.score.values
gold = evaluation.get_gold(user_product)

op = user_product.copy()
op = utils.tarbox_f1_optim(op, low_bound=0)
op['products'] = op['products'].apply(lambda x: [int(i) if i != 'None' else i for i in x.split()])

op = pd.merge(pd.DataFrame({'order_id': user_product.order_id.unique()}),
              op, on=['order_id'], how='left')

res = evaluation.evaluation(gold, op[['order_id', 'products']])

In [356]:
print('F1 Optimization Result: mean-f1-score {}'.format(res.f1score.mean()))


F1 Optimization Result: mean-f1-score 0.40111937528766634

In [328]:
eval_infos_8 = pd.DataFrame(eval_infos_8)

In [329]:
bagging_tree = pd.concat([eval_infos_2, eval_infos_4,
                          eval_infos.iloc[[22, 16, 15]],
                          eval_infos_3, eval_infos_5,
                          eval_infos_6, eval_infos_7,
                          eval_infos_8], axis=0)

In [330]:
def patch(x):
    # restore the full model path from the stored file name
    if x.startswith('xgb'):
        return '/home/public/Instacart/xgb/' + x
    else:
        return '/home/public/Instacart/lgb/' + x
bagging_tree['model_file'] = bagging_tree['model_file'].apply(patch)

In [331]:
bagging_tree.to_hdf(constants.EVA_DATA_DIR + 'bagging_tree.h5', 'bagging', mode='w')

In [332]:
bagging_tree = bagging_tree.sort_values('eval_mf1')

In [333]:
bagging_tree.reset_index()


Out[333]:
    index  eval_auc  eval_mf1  model_file
0       1  0.837777  0.402564  /home/public/Instacart/lgb/lgb_mtwdr_goss_0.83...
1       0  0.838028  0.402665  /home/public/Instacart/lgb/lgb_mtwdr_goss_0.83...
2       3  0.838212  0.402834  /home/public/Instacart/lgb/lgb_mtwdr_goss_0.83...
3       2  0.837845  0.402956  /home/public/Instacart/lgb/lgb_mtwdr_goss_0.83...
4       1  0.839410  0.402959  /home/public/Instacart/lgb/lgb_train_vip_gbdt_...
5       0  0.839316  0.403175  /home/public/Instacart/lgb/lgb_train_vip_gbdt_...
6      15  0.839354  0.403207  /home/public/Instacart/lgb/lgb_gbdt_0.83976702...
7       0  0.838976  0.403230  /home/public/Instacart/lgb/lgb_train_vip_gbdt_...
8      16  0.839337  0.403314  /home/public/Instacart/lgb/lgb_gbdt_0.83975988...
9      22  0.839243  0.403543  /home/public/Instacart/lgb/lgb_gbdt_0.84022029...
10      1  0.839331  0.403580  /home/public/Instacart/lgb/lgb_train_vip_gbdt_...
11      0  0.839467  0.403595  /home/public/Instacart/lgb/lgb_train_vip_gbdt_...
12      0  0.839425  0.403769  /home/public/Instacart/lgb/lgb_train_vip_gbdt_...
13      1  0.839287  0.403786  /home/public/Instacart/lgb/lgb_train_vip_gbdt_...
14      0  0.839904  0.404341  /home/public/Instacart/xgb/xgb_vip_0.839904_0....
15      0  0.840101  0.404459  /home/public/Instacart/xgb/xgb_vip_gbtree_0.84...

In [334]:
pred_evals = []
for idx, m in enumerate(bagging_tree.model_file.values):
    # cache each model's score vector to disk so reruns skip re-scoring
    fp = constants.EVA_DATA_DIR + m.split('/')[-1] + '.pkl'
    if os.path.exists(fp):
        pred = pickle.load(open(fp, 'rb'))
    else:
        pred = generate_pred(m, idx, is_sub=True)
        with open(fp, 'wb') as f:
            pickle.dump(pred, f, pickle.HIGHEST_PROTOCOL)
    pred_evals.append(pred)

In [352]:
# blend hierarchically: bag groups of models first, then bag the group medians
level0 = np.median(pred_evals[0:5], axis=0)   # 0.4034
level1 = np.median(pred_evals[5:9], axis=0)
level2 = np.median([level0, level1] + pred_evals[9:12], axis=0)  # 0.40429
level3 = np.median([level2] + pred_evals[12:14], axis=0)
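
Bagging similar models into an inner median first (levels 0 and 1) before taking the outer medians presumably keeps any one over-represented model family from dominating the final blend: each inner median contributes a single vote to the next level.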

In [353]:
%%time
user_product = gid[['user_id', 'product_id', 'order_id']]
user_product['label'] = label
user_product['score'] = level3
gold = evaluation.get_gold(user_product)

op = user_product.copy()
op = utils.tarbox_f1_optim(op, low_bound=0)
op['products'] = op['products'].apply(lambda x: [int(i) if i != 'None' else i for i in x.split()])

op = pd.merge(pd.DataFrame({'order_id': user_product.order_id.unique()}),
              op, on=['order_id'], how='left')

res = evaluation.evaluation(gold, op[['order_id', 'products']])


CPU times: user 1min 57s, sys: 4.72 s, total: 2min 2s
Wall time: 2min 10s

In [350]:
print('F1 Optimization Result: mean-f1-score {}'.format(res.f1score.mean()))


F1 Optimization Result: mean-f1-score 0.40484816570025384

In [ ]:
op = user_product.copy()
op = utils.shing_f1_optim(op, low_bound=0.01, topk=200)
# right-join so every evaluation order gets a row, then fill orders with no
# selected products using the literal 'None' token the submission format expects
op = pd.merge(op[['order_id', 'products']], orders[['order_id']], on=['order_id'], how='right')
op['products'] = op['products'].fillna('None')
op[['order_id', 'products']].to_csv('./submission/lgb3_big_bag_shing.csv', index=False)
